#########################################################################################################
# Data cleaning: Expanding the State
#########################################################################################################

#Read in datasets
data_ets <- read.csv('data/ets_final.csv')

#Code movements that occurred in either 1963/1964
#First year of Belaunde in office
data_ets$movement_6364 <- NA
data_ets$movement_6364[data_ets$movement_1963 + 
                         data_ets$movement_1964 > 0] <- 1
data_ets$movement_6364[data_ets$movement_1963 + 
                         data_ets$movement_1964 == 0] <- 0 

#Code municipalities where a movement occurred *before* 1963
data_ets$movement_pre63_dum <- NA
data_ets$movement_pre63_dum[data_ets$mov_1956_1962 > 0] <- 1 
data_ets$movement_pre63_dum[data_ets$mov_1956_1962 == 0] <- 0 

#Code municipalities where any movement occurred between 1956 and 1963
data_ets$any_move <- NA
data_ets$any_move[data_ets$mov_1956_1962 > 0|data_ets$movement_6364 > 0] <- 1 
data_ets$any_move[data_ets$mov_1956_1962 == 0 & data_ets$movement_6364 == 0] <- 0 

#Code primary SIPA outcome: Difference in offices pre and post-1963 (1963 is included in control)
data_ets$sipa_64_67 <- ifelse(data_ets$sipa_1964 + data_ets$sipa_1965 + 
                            data_ets$sipa_1966 + data_ets$sipa_1967 > 0, 1, 0)

data_ets$sipa_62_63 <- ifelse(data_ets$sipa_1963 + data_ets$sipa_1962 > 0, 1, 0)

data_ets$sipa_pre_post_diff <- data_ets$sipa_64_67  -
  data_ets$sipa_62_63

#Code secondary SIPA outcome (Table A11): Difference in offices pre and post-1963 (1963 is excluded)
data_ets$sipa_pre_post_diff_excl_1963 <- data_ets$sipa_64_67  -
  data_ets$sipa_1962

#Code primary index outcome: SIPA (differenced, pre-post 1963) + binary CP office in 1967
data_ets$index <- data_ets$sipa_pre_post_diff + data_ets$central_cp_67

#Code index excluding 1963 SIPA data
data_ets$index_excl_1963 <- data_ets$sipa_pre_post_diff_excl_1963 + data_ets$central_cp_67

##Long-term outcome: Land reform
#Binary measure

data_ets$land_binary <- NA
data_ets$land_binary[data_ets$landredist_pc > 0] <- 1
data_ets$land_binary[data_ets$landredist_pc == 0] <- 0

##############
## Controls ##
##############

#Code win by AP in 1963 municipal elections
data_ets$ap_win <- NA
data_ets$ap_win[data_ets$ap_place == 1] <- 1
data_ets$ap_win[data_ets$ap_place != 1] <- 0

#Code whether there are any bureaucrats 
data_ets$bureaucrat_binary <- ifelse(round(exp(data_ets$lemployees1961)) > 1, 1, 0)

#Code education variables as per capita
data_ets$students_pc <- data_ets$primaria_alumnos_62/
  data_ets$total_pop_61
data_ets$schools_pc <- data_ets$primaria_escuelas_62/
  data_ets$total_pop_61
data_ets$teachers_pc <- data_ets$primaria_maestros_62/
  data_ets$total_pop_61

#Log hacienda variable
data_ets$log_hac <- log(data_ets$haciendas + 1)


#Code hacienda population as % of rural population
data_ets$hac_size <-  data_ets$pob_rural_haciendas/(data_ets$haciendas + 0.0001)

covs <- data_ets |> 
  subset(select = c(students_pc,
                    teachers_pc,
                    schools_pc, 
                    ap_win,
                    roads_1964, total_pop_61, 
                    rural_perc_61, 
                    hac_size, log_hac,
                    bureaucrat_binary, elev_1k))

# Calculate percentage of missing values
missing_perc <- sapply(covs, function(x) mean(is.na(x)))

# Print the missing percentages (optional)
print(missing_perc)


#Following guidance of Lin, Green, and Coppock 2016
#Impute mean for missingness of <10%

# Get names of columns with less than 10% missingness
cols_to_impute <- base::names(missing_perc[missing_perc < 0.10])

# Replace NAs in those columns with the mean
for (col in cols_to_impute) {
  covs[[col]][is.na(covs[[col]])] <- mean(covs[[col]], na.rm = TRUE)
}

data_ets <- data_ets %>%
  dplyr::select(-any_of(base::names(covs))) %>%
  bind_cols(covs) 

#Find any variables with missingness >10%
names(missing_perc[missing_perc > 0.1])

#For AP win (missingness > 10%), code missing values as 0 and include covariate
#for missingness
data_ets$ap_win_missing <- as.integer(is.na(data_ets$ap_win))
data_ets$ap_win_recode <- ifelse(is.na(data_ets$ap_win), 0, data_ets$ap_win)

##Create data for parallel trends analysis

#Code categories of movements (New movement in 1963/1964, prior movement pre-1963, and no movements)
data_ets$movement_cat <- case_when(
  data_ets$movement_6364 == 1 & data_ets$movement_pre63_dum == 0 ~ "Yes (1963/1964 only)",
  data_ets$movement_pre63_dum == 1 ~ "Yes (pre-1963)",
  data_ets$movement_6364 == 0 & data_ets$movement_pre63_dum == 0 ~ "No",
  TRUE ~ NA_character_
)

#Relevel factors for plotting
data_ets$movement_cat <- factor(data_ets$movement_cat, levels = c("Yes (1963/1964 only)", 
                                                        "Yes (pre-1963)",
                                                        "No"))

data_long <- data_ets %>%
  pivot_longer(cols = starts_with("sipa_1"),  
               names_to = "outcome_type",
               values_to = "outcome_value")


data_long$year <- as.numeric(substr(data_long$outcome_type, 6, 9))

summary_data <- data_long %>%
  group_by(year, movement_cat) %>%
  summarise(mean_outcome = mean(outcome_value, na.rm = TRUE),
            se = sd(outcome_value, na.rm = TRUE) / sqrt(n()),
            ci_lower = mean_outcome - 1.96 * se,
            ci_upper = mean_outcome + 1.96 * se,
            .groups = "drop")

summary_data <- summary_data %>%
  filter(!is.na(movement_cat))

